Here we will perform a quality control (QC) of the samples obtained from Newcastle.
library(Seurat)
library(scDblFinder)
library(tidyverse)
library(readxl)
library(ggpubr)
library(here)
library(glue)
library(DT)
library(harmony)
# Functions
source(here::here("scRNA-seq/bin/utils.R"))
# Thresholds
min_lib_size_frozen <- 1000
min_lib_size_fresh <- 850
min_n_genes_frozen <- 450
min_n_genes_fresh <- 400
max_pct_mt <- 17.5
min_cells <- 5
meatadata_newcastle <- read_excel(
here("data/Newcastle_samples/Tonsil_cell_atlas_metadata.xlsx"),
col_names = TRUE
)
donor_metadata <- read_csv(here("data/tonsil_atlas_donor_metadata.csv"))
DT::datatable(meatadata_newcastle)
DT::datatable(donor_metadata)
dirs <- list.dirs(here("data/Newcastle_samples"), full.names = FALSE)
dirs <- str_subset(dirs, "Results")
counts_list <- map(dirs, \(x) {
mat <- Read10X(here(glue("data/Newcastle_samples/{x}")))
mat
})
names(counts_list) <- str_remove(dirs, "_Results")
seurat_list <- map(names(counts_list), \(x) {
mat <- counts_list[[x]]
seurat_obj <- CreateSeuratObject(counts = mat)
seurat_obj$gem_id <- x
seurat_obj$donor_id_newcastle <- str_sub(x, start = 1, end = 5)
seurat_obj$preservation <- str_extract(x, "(fresh|frozen)")
seurat_obj <- RenameCells(seurat_obj, add.cell.id = x)
seurat_obj
})
names(seurat_list) <- names(counts_list)
seurat_list
## $`TIP01-1_fresh`
## An object of class Seurat
## 36601 features across 14951 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP01-1_frozen`
## An object of class Seurat
## 36601 features across 5256 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP01-2_frozen`
## An object of class Seurat
## 36601 features across 13170 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP02-1_fresh`
## An object of class Seurat
## 36601 features across 13557 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP02-1_frozen`
## An object of class Seurat
## 36601 features across 9229 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP02-2_fresh`
## An object of class Seurat
## 36601 features across 15335 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP02-2_frozen`
## An object of class Seurat
## 36601 features across 10369 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP03-1_fresh`
## An object of class Seurat
## 36601 features across 14435 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP03-1_frozen`
## An object of class Seurat
## 36601 features across 5644 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP03-2_fresh`
## An object of class Seurat
## 36601 features across 16425 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP03-2_frozen`
## An object of class Seurat
## 36601 features across 16167 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP04-1_fresh`
## An object of class Seurat
## 36601 features across 5563 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP04-1_frozen`
## An object of class Seurat
## 36601 features across 5820 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP04-2_fresh`
## An object of class Seurat
## 36601 features across 3508 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
##
## $`TIP04-2_frozen`
## An object of class Seurat
## 36601 features across 5189 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
rm(counts_list)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 10927220 583.6 19619800 1047.9 19619800 1047.9
## Vcells 327967216 2502.2 578039883 4410.1 577247477 4404.1
seurat <- Reduce(merge, seurat_list)
seurat
## An object of class Seurat
## 36601 features across 154618 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
rm(seurat_list)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 10778545 575.7 19619800 1047.9 19619800 1047.9
## Vcells 633058392 4829.9 2110084061 16098.7 2624712661 20025.0
# Add donor metadata
donor_metadata <- donor_metadata[donor_metadata$hospital == "Newcastle", ]
donor_metadata$donor_id_newcastle <- str_remove(
donor_metadata$comments,
"original id: "
)
seurat$barcode <- colnames(seurat)
new_metadata <- left_join(
seurat@meta.data,
donor_metadata,
by = "donor_id_newcastle"
)
rownames(new_metadata) <- new_metadata$barcode
seurat@meta.data <- new_metadata
# Number of detected genes
seurat@meta.data %>%
ggplot(aes(gem_id, nFeature_RNA, fill = preservation)) +
geom_boxplot(outlier.size = 0.1) +
scale_y_continuous(
limits = c(0, 8000),
breaks = c(0, 1000, 2000, 3000, 5000, 6000, 7000, 8000)
) +
ylab("Number of detected genes") +
theme_classic() +
theme(
legend.title = element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
# Percentage of mitochondrial expression
seurat$pct_mt <- PercentageFeatureSet(seurat, pattern = "^MT-")
seurat@meta.data %>%
ggplot(aes(gem_id, pct_mt, fill = preservation)) +
geom_boxplot(outlier.size = 0.1) +
scale_y_continuous(limits = c(0, 100)) +
ylab("% of mitochondrial expression") +
theme_classic() +
theme(
legend.title = element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
Library size:
seurat_list <- SplitObject(seurat, split.by = "preservation")
min_lib_sizes <- c(min_lib_size_fresh, min_lib_size_frozen)
names(min_lib_sizes) <- names(seurat_list)
for (x in names(seurat_list)) {
print(x)
lib_size_hist <- seurat_list[[x]]@meta.data %>%
plot_histogram_qc(x = "nCount_RNA", x_lab = "Library Size (log10(total UMI))") +
geom_vline(xintercept = min_lib_sizes[x], linetype = "dashed", color = "red")
lib_size_hist1 <- lib_size_hist +
scale_x_log10()
lib_size_hist2 <- lib_size_hist +
scale_x_continuous(limits = c(0, 4000)) +
xlab("Library Size (total UMI)") +
theme_pubr()
lib_size_arranged <- lib_size_hist1 + lib_size_hist2
print(lib_size_arranged)
}
## [1] "fresh"
## [1] "frozen"
Number of detected genes:
min_n_genes_l <- c(min_n_genes_fresh, min_n_genes_frozen)
names(min_n_genes_l) <- names(seurat_list)
for (x in names(seurat_list)) {
print(x)
n_genes_hist1 <- seurat_list[[x]]@meta.data %>%
plot_histogram_qc(x = "nFeature_RNA", x_lab = "Number of Detected Genes") +
geom_vline(xintercept = min_n_genes_l[x], linetype = "dashed", color = "red")
n_genes_hist2 <- n_genes_hist1 +
scale_x_continuous(limits = c(0, 2000))
n_genes_hist_arranged <- n_genes_hist1 + n_genes_hist2
print(n_genes_hist_arranged)
}
## [1] "fresh"
## [1] "frozen"
Percentage of mitochondrial expression:
for (x in names(seurat_list)) {
print(x)
pct_mt_hist <- seurat_list[[x]]@meta.data %>%
plot_histogram_qc(x = "pct_mt", x_lab = "% Mitochondrial Expression") +
geom_vline(xintercept = max_pct_mt, linetype = "dashed", color = "red") +
scale_x_continuous(limits = c(0, 100))
print(pct_mt_hist)
}
## [1] "fresh"
## [1] "frozen"
Joint QC metrics:
for (x in names(seurat_list)) {
print(x)
pct_mt_vs_lib_size <- FeatureScatter(
seurat_list[[x]],
feature1 = "nCount_RNA",
feature2 = "pct_mt",
pt.size = 0.15,
cols = rep("black", length(levels(Idents(seurat_list[[x]]))))
)
pct_mt_vs_lib_size <- pct_mt_vs_lib_size +
labs(x = "Library Size (total UMI)", y = "% Mitochondrial Expression") +
theme(legend.position = "none", plot.title = element_blank()) +
geom_vline(xintercept = min_lib_sizes[x], linetype = "dashed", color = "red") +
geom_hline(yintercept = max_pct_mt, linetype = "dashed", color = "red")
print(pct_mt_vs_lib_size)
}
## [1] "fresh"
## [1] "frozen"
Let us perform linear (PCA) and nonlinear (UMAP) dimensionality reduction to assess the batch effects between fresh and frozen samples:
rm(seurat_list)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 10956388 585.2 19619800 1047.9 19619800 1047.9
## Vcells 636376873 4855.2 2110084061 16098.7 2624712661 20025.0
seurat <- seurat %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunUMAP(dims = 1:30, reduction = "pca")
DimPlot(seurat, group.by = "preservation")
Indeed, preservation is the main source of batch effects.
Subset cells:
is_low_quality <- with(seurat@meta.data,
(preservation == "fresh" & nCount_RNA < min_lib_size_fresh) |
(preservation == "frozen" & nCount_RNA < min_lib_size_frozen) |
(preservation == "fresh" & nFeature_RNA < min_n_genes_fresh) |
(preservation == "frozen" & nFeature_RNA < min_n_genes_frozen) |
(pct_mt > max_pct_mt)
)
table(is_low_quality)
## is_low_quality
## FALSE TRUE
## 121395 33223
seurat <- seurat[, !is_low_quality]
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 11017963 588.5 19619800 1047.9 19619800 1047.9
## Vcells 1773028718 13527.2 3038697047 23183.5 2624712661 20025.0
DimPlot(seurat, group.by = "preservation")
We now assess if fresh and frozen can be integrated with Harmony:
seurat$preservation <- factor(seurat$preservation)
seurat <- seurat %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "preservation") %>%
RunUMAP(dims = 1:30, reduction = "harmony")
DimPlot(
seurat,
group.by = "preservation",
raster = FALSE,
split.by = "preservation"
)
According to Luecken MD et al.: “A guideline to setting this threshold is to use the minimum cell cluster size that is of interest and leaving some leeway for dropout effects. For example, filtering out genes expressed in fewer than 20 cells may make it difficult to detect cell clusters with fewer than 20 cells. For datasets with high dropout rates, this threshold may also complicate the detection of larger clusters. The choice of threshold should scale with the number of cells in the dataset and the intended downstream analysis.”
Since we want to detect rare cell types in the tonsil, we will be very permissive and retain genes that are expressed in at least 5 cells.
n_cells <- Matrix::rowSums(seurat[["RNA"]]@counts > 0)
gene_qc <- n_cells %>%
as.data.frame() %>%
ggplot(aes(n_cells)) +
geom_histogram(bins = 100, alpha = 0.75) +
scale_x_log10("Number of cells") +
theme_bw()
gene_qc +
geom_vline(xintercept = min_cells, linetype = "dashed", color = "red")
top_50_genes <- sort(n_cells, decreasing = TRUE)[1:50]
top_50_genes_df <- data.frame(
gene = names(top_50_genes),
n_cells = top_50_genes
)
top_50_genes_df %>%
ggplot(aes(fct_reorder(gene, n_cells), n_cells)) +
geom_point() +
labs(x = "", y = "Number of expressing cells") +
coord_flip()
kept_genes <- rownames(seurat)[n_cells > min_cells]
table(n_cells > min_cells)
##
## FALSE TRUE
## 10612 25989
(seurat <- subset(seurat, features = kept_genes))
## An object of class Seurat
## 25989 features across 121395 samples within 1 assay
## Active assay: RNA (25989 features, 1984 variable features)
## 3 dimensional reductions calculated: pca, umap, harmony
We use scDblFinder:
sce <- as.SingleCellExperiment(seurat)
dbl_densities <- computeDoubletDensity(
sce,
dims = 30,
subset.row = VariableFeatures(seurat)
)
summary(dbl_densities)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1200 0.3800 0.9474 0.9600 38.4200
seurat$doublet_score <- dbl_densities
FeaturePlot(
seurat,
features = "doublet_score",
raster = FALSE
) &
scale_color_viridis_c(option = "magma")
FeaturePlot(
seurat,
features = c("CD79A", "CD3D"),
raster = FALSE
) &
scale_color_viridis_c(option = "magma")
FeatureScatter(
seurat,
feature1 = "CD79A",
feature2 = "CD3D",
group.by = "preservation"
) +
geom_vline(xintercept = 1) +
geom_hline(yintercept = 1)
seurat$is_doublet_B_T <- ifelse(
((seurat[["RNA"]]@data["CD79A", ] > 1) & (seurat[["RNA"]]@data["CD3D", ] > 1)),
"doublet_B_T",
"not_doublet_B_T"
)
DimPlot(seurat, group.by = "is_doublet_B_T", raster = FALSE)
seurat@meta.data %>%
ggplot(aes(doublet_score)) +
geom_histogram(bins = 100) +
theme_pubr()
seurat$is_doublet <- seurat$doublet_score > 5
DimPlot(seurat, group.by = "is_doublet", raster = FALSE)
(seurat <- seurat[, !seurat$is_doublet])
## An object of class Seurat
## 25989 features across 117565 samples within 1 assay
## Active assay: RNA (25989 features, 1984 variable features)
## 3 dimensional reductions calculated: pca, umap, harmony
Let’s assess the number of cells after QC, colored by preservation strategy (fresh/frozen):
seurat@meta.data %>%
group_by(gem_id, preservation) %>%
summarize(n_cells = n()) %>%
ggplot(aes(fct_reorder(gem_id, n_cells), n_cells, fill = preservation)) +
geom_col() +
geom_text(aes(label = n_cells), hjust = -0.1) +
ylab("Number of cells") +
theme_pubr() +
theme(axis.title.y = element_blank()) +
coord_flip()
seurat@meta.data %>%
ggplot(aes(gem_id, nFeature_RNA, fill = preservation)) +
geom_boxplot(outlier.size = 0.1) +
scale_y_continuous(
limits = c(0, 8000),
breaks = c(0, 1000, 2000, 3000, 5000, 6000, 7000, 8000)
) +
ylab("Number of detected genes") +
theme_classic() +
theme(
legend.title = element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
Downstream in the analysis we detected that cyropreserved (frozen) cells displayed biased proportions as compared with the freshly processed ones. Thus, we will exclude them from the atlas, as they are unreliable:
seurat
## An object of class Seurat
## 25989 features across 117565 samples within 1 assay
## Active assay: RNA (25989 features, 1984 variable features)
## 3 dimensional reductions calculated: pca, umap, harmony
(seurat <- seurat[, seurat$preservation == "fresh"])
## An object of class Seurat
## 25989 features across 58028 samples within 1 assay
## Active assay: RNA (25989 features, 1984 variable features)
## 3 dimensional reductions calculated: pca, umap, harmony
dir.create(here("scRNA-seq/results/R_objects/seurat_objects_newcastle"))
saveRDS(seurat, here("scRNA-seq/results/R_objects/seurat_objects_newcastle/1-seurat_object_newcastle_filtered.rds"))
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /home/groups/singlecell/rmassoni/anaconda3/envs/richter/lib/libopenblasp-r0.3.21.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=es_ES.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=es_ES.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] harmony_0.1.1 Rcpp_1.0.10 DT_0.27 glue_1.6.2 here_1.0.1 ggpubr_0.6.0 readxl_1.4.2 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.0 purrr_1.0.1 readr_2.1.4 tidyr_1.3.0 tibble_3.1.8 ggplot2_3.4.1 tidyverse_1.3.2 scDblFinder_1.12.0 SingleCellExperiment_1.20.0 SummarizedExperiment_1.28.0 Biobase_2.58.0 GenomicRanges_1.50.0 GenomeInfoDb_1.34.9 IRanges_2.32.0 S4Vectors_0.36.0 BiocGenerics_0.44.0 MatrixGenerics_1.10.0 matrixStats_0.63.0 SeuratObject_4.1.3 Seurat_4.3.0 BiocStyle_2.26.0
##
## loaded via a namespace (and not attached):
## [1] rtracklayer_1.58.0 scattermore_0.8 R.methodsS3_1.8.2 bit64_4.0.5 knitr_1.42 R.utils_2.12.2 irlba_2.3.5.1 DelayedArray_0.24.0 data.table_1.14.6 RCurl_1.98-1.10 generics_0.1.3 ScaledMatrix_1.6.0 cowplot_1.1.1 RANN_2.6.1 future_1.31.0 bit_4.0.5 tzdb_0.3.0 spatstat.data_3.0-0 xml2_1.3.3 lubridate_1.9.1 httpuv_1.6.9 assertthat_0.2.1 viridis_0.6.2 gargle_1.3.0 xfun_0.37 hms_1.1.2 jquerylib_0.1.4 evaluate_0.20 promises_1.2.0.1 fansi_1.0.4 restfulr_0.0.15 dbplyr_2.3.0 igraph_1.3.5 DBI_1.1.3 htmlwidgets_1.6.1 spatstat.geom_3.0-6 googledrive_2.0.0 ellipsis_0.3.2 crosstalk_1.2.0 backports_1.4.1 bookdown_0.33 deldir_1.0-6 sparseMatrixStats_1.10.0 vctrs_0.5.2 ROCR_1.0-11
## [46] abind_1.4-5 cachem_1.0.6 withr_2.5.0 progressr_0.13.0 vroom_1.6.1 sctransform_0.3.5 GenomicAlignments_1.34.0 scran_1.26.2 goftest_1.2-3 cluster_2.1.4 lazyeval_0.2.2 crayon_1.5.2 spatstat.explore_3.0-6 labeling_0.4.2 edgeR_3.40.2 pkgconfig_2.0.3 nlme_3.1-162 vipor_0.4.5 rlang_1.0.6 globals_0.16.2 lifecycle_1.0.3 miniUI_0.1.1.1 modelr_0.1.10 rsvd_1.0.5 cellranger_1.1.0 rprojroot_2.0.3 polyclip_1.10-4 lmtest_0.9-40 Matrix_1.5-3 carData_3.0-5 zoo_1.8-11 reprex_2.0.2 beeswarm_0.4.0 ggridges_0.5.4 googlesheets4_1.0.1 png_0.1-8 viridisLite_0.4.1 rjson_0.2.21 bitops_1.0-7 R.oo_1.25.0 KernSmooth_2.23-20 Biostrings_2.66.0 DelayedMatrixStats_1.20.0 parallelly_1.34.0 spatstat.random_3.1-3
## [91] rstatix_0.7.2 ggsignif_0.6.4 beachmat_2.14.2 scales_1.2.1 magrittr_2.0.3 plyr_1.8.8 ica_1.0-3 zlibbioc_1.44.0 compiler_4.2.2 dqrng_0.3.0 BiocIO_1.8.0 RColorBrewer_1.1-3 fitdistrplus_1.1-8 Rsamtools_2.14.0 cli_3.6.0 XVector_0.38.0 listenv_0.9.0 patchwork_1.1.2 pbapply_1.7-0 MASS_7.3-58.2 tidyselect_1.2.0 stringi_1.7.12 highr_0.10 yaml_2.3.7 BiocSingular_1.14.0 locfit_1.5-9.7 ggrepel_0.9.3 grid_4.2.2 sass_0.4.5 tools_4.2.2 timechange_0.2.0 future.apply_1.10.0 parallel_4.2.2 bluster_1.8.0 metapod_1.6.0 gridExtra_2.3 farver_2.1.1 Rtsne_0.16 digest_0.6.31 BiocManager_1.30.19 shiny_1.7.4 car_3.1-2 broom_1.0.3 scuttle_1.8.4 later_1.3.0
## [136] RcppAnnoy_0.0.20 httr_1.4.4 colorspace_2.1-0 rvest_1.0.3 XML_3.99-0.13 fs_1.6.1 tensor_1.5 reticulate_1.26 splines_4.2.2 uwot_0.1.14 statmod_1.5.0 spatstat.utils_3.0-1 scater_1.26.1 sp_1.6-0 xgboost_1.7.5.1 plotly_4.10.1 xtable_1.8-4 jsonlite_1.8.4 R6_2.5.1 pillar_1.8.1 htmltools_0.5.4 mime_0.12 fastmap_1.1.0 BiocParallel_1.32.5 BiocNeighbors_1.16.0 codetools_0.2-19 utf8_1.2.3 lattice_0.20-45 bslib_0.4.2 spatstat.sparse_3.0-0 ggbeeswarm_0.7.1 leiden_0.4.3 survival_3.5-3 limma_3.54.0 rmarkdown_2.20 munsell_0.5.0 GenomeInfoDbData_1.2.9 haven_2.5.1 reshape2_1.4.4 gtable_0.3.1